# import all packages and set plots to be embedded inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
%matplotlib inline
Load in your dataset and describe its properties through the questions below. Try and motivate your exploration goals through this section.
# Load the Ford GoBike trip records from the CSV file in the working
# directory, then display the (rows, columns) shape of the raw table.
df = pd.read_csv(filepath_or_buffer='201902-fordgobike-tripdata.csv')
df.shape
(183412, 16)
# Discard every trip that has a missing value in any column, then re-check
# the shape to see how many rows remain.
df = df.dropna(axis=0, how='any')
df.shape
(174952, 16)
# Preview the first rows of the table after the rows with nulls were dropped.
df.head()
| duration_sec | start_time | end_time | start_station_id | start_station_name | start_station_latitude | start_station_longitude | end_station_id | end_station_name | end_station_latitude | end_station_longitude | bike_id | user_type | member_birth_year | member_gender | bike_share_for_all_trip | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 52185 | 2019-02-28 17:32:10.1450 | 2019-03-01 08:01:55.9750 | 21.0 | Montgomery St BART Station (Market St at 2nd St) | 37.789625 | -122.400811 | 13.0 | Commercial St at Montgomery St | 37.794231 | -122.402923 | 4902 | Customer | 1984.0 | Male | No |
| 2 | 61854 | 2019-02-28 12:13:13.2180 | 2019-03-01 05:24:08.1460 | 86.0 | Market St at Dolores St | 37.769305 | -122.426826 | 3.0 | Powell St BART Station (Market St at 4th St) | 37.786375 | -122.404904 | 5905 | Customer | 1972.0 | Male | No |
| 3 | 36490 | 2019-02-28 17:54:26.0100 | 2019-03-01 04:02:36.8420 | 375.0 | Grove St at Masonic Ave | 37.774836 | -122.446546 | 70.0 | Central Ave at Fell St | 37.773311 | -122.444293 | 6638 | Subscriber | 1989.0 | Other | No |
| 4 | 1585 | 2019-02-28 23:54:18.5490 | 2019-03-01 00:20:44.0740 | 7.0 | Frank H Ogawa Plaza | 37.804562 | -122.271738 | 222.0 | 10th Ave at E 15th St | 37.792714 | -122.248780 | 4898 | Subscriber | 1974.0 | Male | Yes |
| 5 | 1793 | 2019-02-28 23:49:58.6320 | 2019-03-01 00:19:51.7600 | 93.0 | 4th St at Mission Bay Blvd S | 37.770407 | -122.391198 | 323.0 | Broadway at Kearny | 37.798014 | -122.405950 | 5200 | Subscriber | 1959.0 | Male | No |
# Summarize the column dtypes and non-null counts before any type conversion.
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 174952 entries, 0 to 183411 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 duration_sec 174952 non-null int64 1 start_time 174952 non-null object 2 end_time 174952 non-null object 3 start_station_id 174952 non-null float64 4 start_station_name 174952 non-null object 5 start_station_latitude 174952 non-null float64 6 start_station_longitude 174952 non-null float64 7 end_station_id 174952 non-null float64 8 end_station_name 174952 non-null object 9 end_station_latitude 174952 non-null float64 10 end_station_longitude 174952 non-null float64 11 bike_id 174952 non-null int64 12 user_type 174952 non-null object 13 member_birth_year 174952 non-null float64 14 member_gender 174952 non-null object 15 bike_share_for_all_trip 174952 non-null object dtypes: float64(7), int64(2), object(7) memory usage: 22.7+ MB
# Convert the two timestamp columns from strings to datetime64 and store the
# start station name as text, then re-check the dtypes.
for time_col in ('start_time', 'end_time'):
    df[time_col] = pd.to_datetime(df[time_col])
df['start_station_name'] = df['start_station_name'].astype(str)
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 174952 entries, 0 to 183411 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 duration_sec 174952 non-null int64 1 start_time 174952 non-null datetime64[ns] 2 end_time 174952 non-null datetime64[ns] 3 start_station_id 174952 non-null float64 4 start_station_name 174952 non-null object 5 start_station_latitude 174952 non-null float64 6 start_station_longitude 174952 non-null float64 7 end_station_id 174952 non-null float64 8 end_station_name 174952 non-null object 9 end_station_latitude 174952 non-null float64 10 end_station_longitude 174952 non-null float64 11 bike_id 174952 non-null int64 12 user_type 174952 non-null object 13 member_birth_year 174952 non-null float64 14 member_gender 174952 non-null object 15 bike_share_for_all_trip 174952 non-null object dtypes: datetime64[ns](2), float64(7), int64(2), object(5) memory usage: 22.7+ MB
# Preview the first rows again to check the converted timestamp columns.
df.head()
| duration_sec | start_time | end_time | start_station_id | start_station_name | start_station_latitude | start_station_longitude | end_station_id | end_station_name | end_station_latitude | end_station_longitude | bike_id | user_type | member_birth_year | member_gender | bike_share_for_all_trip | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 52185 | 2019-02-28 17:32:10.145 | 2019-03-01 08:01:55.975 | 21.0 | Montgomery St BART Station (Market St at 2nd St) | 37.789625 | -122.400811 | 13.0 | Commercial St at Montgomery St | 37.794231 | -122.402923 | 4902 | Customer | 1984.0 | Male | No |
| 2 | 61854 | 2019-02-28 12:13:13.218 | 2019-03-01 05:24:08.146 | 86.0 | Market St at Dolores St | 37.769305 | -122.426826 | 3.0 | Powell St BART Station (Market St at 4th St) | 37.786375 | -122.404904 | 5905 | Customer | 1972.0 | Male | No |
| 3 | 36490 | 2019-02-28 17:54:26.010 | 2019-03-01 04:02:36.842 | 375.0 | Grove St at Masonic Ave | 37.774836 | -122.446546 | 70.0 | Central Ave at Fell St | 37.773311 | -122.444293 | 6638 | Subscriber | 1989.0 | Other | No |
| 4 | 1585 | 2019-02-28 23:54:18.549 | 2019-03-01 00:20:44.074 | 7.0 | Frank H Ogawa Plaza | 37.804562 | -122.271738 | 222.0 | 10th Ave at E 15th St | 37.792714 | -122.248780 | 4898 | Subscriber | 1974.0 | Male | Yes |
| 5 | 1793 | 2019-02-28 23:49:58.632 | 2019-03-01 00:19:51.760 | 93.0 | 4th St at Mission Bay Blvd S | 37.770407 | -122.391198 | 323.0 | Broadway at Kearny | 37.798014 | -122.405950 | 5200 | Subscriber | 1959.0 | Male | No |
# Remove, in place, the trips whose rider gender is recorded as "Other", then
# list the gender categories that remain together with their trip counts.
is_other = df['member_gender'].eq("Other")
index_names = df.loc[is_other].index
df.drop(index=index_names, inplace=True)
df['member_gender'].value_counts()
Male 130500 Female 40805 Name: member_gender, dtype: int64
# Derive the rider age from the birth year, using 2022 as the reference year.
# NOTE(review): the trips in this file start in 2019, so this is the age in
# 2022 rather than the age at ride time -- confirm this is intended.
df['member_age'] = df['member_birth_year'].rsub(2022)
# `data` is a second name for the same DataFrame (not a copy), so the column
# added below is visible through `df` as well.
data = df
# Express the trip duration in minutes instead of seconds.
data['duration_min'] = data['duration_sec'].div(60)
The dataset contains 183,412 trips and 16 features: duration_sec, start_time, end_time, start_station_id, start_station_name, start_station_latitude, start_station_longitude, end_station_id, end_station_name, end_station_latitude, end_station_longitude, bike_id, user_type, member_birth_year, member_gender and bike_share_for_all_trip.
I had to:
1- drop null values
2- convert the duration from seconds to minutes
3- drop the [Other] gender category for simplicity
4- convert the [member_birth_year] feature to [member_age]
The duration of the trip is the main feature of interest, explored with the aid of the other features.
In this section, investigate distributions of individual variables. If you see unusual points or outliers, take a deeper look to clean things up and prepare yourself to look at relationships between variables.
# First look at the trip-duration distribution with matplotlib's default
# binning. The labels are set on the current axes and the bars are then drawn
# on those same axes.
ax = plt.gca()
ax.set_xlabel('duration in minutes')
ax.set_ylabel('frequency')
ax.set_title('the frequency distribution of the trip duration in minutes')
ax.hist('duration_min', data=df)
(array([1.70907e+05, 2.13000e+02, 6.10000e+01, 2.80000e+01, 2.10000e+01,
1.90000e+01, 1.90000e+01, 1.90000e+01, 1.20000e+01, 6.00000e+00]),
array([1.01666667e+00, 1.41828333e+02, 2.82640000e+02, 4.23451667e+02,
5.64263333e+02, 7.05075000e+02, 8.45886667e+02, 9.86698333e+02,
1.12751000e+03, 1.26832167e+03, 1.40913333e+03]),
<BarContainer object of 10 artists>)
# Zoom in on the short trips: one-minute-wide bins with edges from 0 to 99
# minutes and a tick mark every five minutes.
bins = np.arange(100)
ticks = bins[::5]
plt.hist('duration_min', bins=bins, data=data)
plt.xticks(ticks, labels=ticks)
plt.xlabel('Trip Duration in Minute')
plt.ylabel('frequency')
plt.title('the frequency distribution of the trip duration in minutes (limeted to 100)');
# Count how many trips started at each station.
df.start_station_name.value_counts()
Market St at 10th St 3576
San Francisco Caltrain Station 2 (Townsend St at 4th St) 3370
Berry St at 4th St 2910
Montgomery St BART Station (Market St at 2nd St) 2643
Powell St BART Station (Market St at 4th St) 2576
...
Willow St at Vine St 9
Parker Ave at McAllister St 6
21st Ave at International Blvd 4
Palm St at Willow St 3
16th St Depot 2
Name: start_station_name, Length: 329, dtype: int64
# Pie chart of each station's share of trip starts among the 60 stations with
# the most trip starts.
top_stations = df['start_station_name'].value_counts().iloc[:60]
top_stations.plot.pie(
    figsize=(12, 18),
    fontsize=100,
    autopct='%.0f%%',
    radius=6,
    rotatelabels=True,
)
plt.title('Percentage of crowd \n for the top 60 stations', fontdict={'fontsize': 130})
plt.show()
# Compare the number of rides per rider gender.
plt.hist('member_gender', data=df)
plt.gca().set(xlabel='Gender', ylabel='Count')
plt.title('bike demanding figure shows difference of rides number between gender');
# Distribution of trip start hours. The hour is taken as an integer (0-23)
# and binned with exactly one bin per hour; the default of 10 bins would merge
# neighbouring hours unevenly and distort the shape of the distribution.
start_hours = df['start_time'].dt.hour
hour_edges = np.arange(25) - 0.5  # edges -0.5, 0.5, ..., 23.5 centre each bar on its hour
start_hours.hist(bins=hour_edges, figsize=(7, 7))
plt.xticks(np.arange(24))
plt.xlabel('start hour');
plt.ylabel('frequency');
plt.title('the frequency distribution start hour of the trip');
# Age distribution of the riders in five-year-wide bins.
df['member_age'] = df['member_age'].astype('int')
bins = np.arange(0, data['member_age'].max()+5, 5)
plt.hist(data=df, x='member_age',bins=bins)
# The age is plotted on the x-axis and the count on the y-axis, so the labels
# must be assigned that way round (they were swapped before).
plt.xlabel('Age');
plt.ylabel('frequency');
plt.title('the frequency distribution of user ages');
Make sure that, after every plot or related series of plots, that you include a Markdown cell with comments about what you observed, and what you plan on investigating next.
My main interest is the trip duration. Since most of the trips lasted less than about 100 minutes, I focused on the range below 100 minutes, and I converted the duration from seconds to minutes because seconds are not a clear enough unit.
Most of the trips lasted around 5 to 15 minutes.
The start-hour frequency figure shows that the rush hours coincide with the usual start and end times of work and study.
There were 329 start stations, and the trips were concentrated in the busiest ones, so I focused on the top 60. The data also shows that most users are male. As expected, the age distribution shows that most users are between 20 and 40 years old, which suggests that workers are the major clients.
In this section, investigate relationships between pairs of variables in your data. Make sure the variables that you cover here have been introduced in some fashion in the previous section (univariate exploration).
# Trip counts for the six busiest start hours (the rush hours).
hour_of_start = df['start_time'].dt.strftime('%H')
x = hour_of_start.value_counts().iloc[:6]
x
17 20497 08 19881 18 15785 09 14907 16 13196 07 10055 Name: start_time, dtype: int64
# Keep only the trips that started during one of the six busiest hours,
# retaining the start station and start time columns. The hour labels are
# computed once, and the rush hours are taken from the counts in `x` instead
# of being hard-coded, so the selection always matches the data.
start_hour_label = df['start_time'].dt.strftime('%H')
x = start_hour_label.value_counts().head(6)
peak = df.loc[start_hour_label.isin(x.index), ['start_station_name', 'start_time']]
# For each rush hour, count the trips that started at each station.
peak.start_station_name.groupby(peak.start_time.dt.strftime('%H')).value_counts()
start_time start_station_name
07 San Francisco Caltrain Station 2 (Townsend St at 4th St) 397
San Francisco Caltrain (Townsend St at 4th St) 241
Howard St at Beale St 200
Beale St at Harrison St 183
San Francisco Ferry Building (Harry Bridges Plaza) 177
...
18 Farnam St at Fruitvale Ave 1
George St at 1st St 1
Leavenworth St at Broadway 1
San Pedro St at Hedding St 1
Williams Ave at 3rd St 1
Name: start_station_name, Length: 1879, dtype: int64
# Share of trip starts per station for the trips that began in the '07' hour,
# limited to the 60 stations with the most such trips.
# NOTE(review): only the '07' hour is plotted although the title says
# "at rush hours" and `peak` holds six hours -- confirm which is intended.
started_at_seven = peak['start_time'].dt.strftime('%H').eq('07')
peak_time = peak.loc[started_at_seven, 'start_station_name'].value_counts()
peak_time.iloc[:60].plot.pie(
    figsize=(12, 18),
    fontsize=100,
    autopct='%.0f%%',
    radius=6,
    rotatelabels=True,
)
plt.title('percentage of crowd \n at top 60 stations \n at rush hours', fontdict={'fontsize': 110})
plt.show()
# Box plot of the trip duration (seconds) per rider gender, with the y-axis
# limited to the 0-2000 second range.
duration_ax = sb.boxplot(x='member_gender', y='duration_sec', data=df)
duration_ax.set_xlabel('Gender')
duration_ax.set_ylabel('Trip Duration in Seconds')
duration_ax.set_ylim([0, 2000])
plt.title('the trip duration according to agent gender')
Text(0.5, 1.0, 'the trip duration according to agent gender')
Male riders tend to have shorter trips than female riders.
# Draw a histogram of rider gender for each bike_share_for_all_trip group.
df.member_gender.groupby(df.bike_share_for_all_trip).hist()
# NOTE(review): the legend labels assume the groups are drawn in the order
# 'No' then 'Yes' -- confirm against the groupby ordering.
plt.legend(['No','Yes'])
plt.xlabel('gender');
plt.ylabel('frequency');
plt.title('the frequency distribution share trip acceptance according to gender');
The result shown is as expected, since male users tend to be more familiar with other users.
# Bar plot of the trip duration (seconds) for each day of the week.
day = df.start_time.dt.strftime('%A')  # weekday name of each trip start
# Fix the bar order to the calendar week; without `order` the days appear in
# the order in which they happen to occur in the data.
# NOTE: the names below assume '%A' yields English weekday names.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday',
                 'Saturday', 'Sunday']
sb.barplot(x=day, y=df.duration_sec, order=weekday_order)
plt.title('average trip duration in week days');
The riding trips are much shorter on Monday through Friday compared to weekends. It indicates a pretty stable and efficient usage of the sharing system on normal work days, while more casual flexible use on weekends.
Create plots of three or more variables to investigate your data even further. Make sure that your investigations are justified, and follow from your work in the previous sections.
# Work on an independent copy so that later changes to `data` cannot leak
# into `df` (a plain `data = df` only creates a second name for the same
# DataFrame).
data = df.copy()
data.member_age.value_counts() #check the frequency of member age
34 10015
29 9145
33 8805
32 8495
31 8339
...
102 3
78 2
88 2
95 1
144 1
Name: member_age, Length: 72, dtype: int64
# Filter out the outlier ages (above 85) identified by visually examining the
# counts above. The explicit .copy() gives `data` its own DataFrame, so the
# column assignment below does not raise SettingWithCopyWarning.
data = data.query('member_age <= 85').copy()
# Store the member age as an integer.
data['member_age'] = data['member_age'].astype('int')
<ipython-input-151-792b91a54179>:4: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
data['member_age'] = data['member_age'].astype('int')
# Check the row count and dtypes after the age filter.
data.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 171119 entries, 0 to 183411 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 duration_sec 171119 non-null int64 1 start_time 171119 non-null datetime64[ns] 2 end_time 171119 non-null datetime64[ns] 3 start_station_id 171119 non-null float64 4 start_station_name 171119 non-null object 5 start_station_latitude 171119 non-null float64 6 start_station_longitude 171119 non-null float64 7 end_station_id 171119 non-null float64 8 end_station_name 171119 non-null object 9 end_station_latitude 171119 non-null float64 10 end_station_longitude 171119 non-null float64 11 bike_id 171119 non-null int64 12 user_type 171119 non-null object 13 member_birth_year 171119 non-null float64 14 member_gender 171119 non-null object 15 bike_share_for_all_trip 171119 non-null object 16 member_age 171119 non-null int32 17 duration_min 171119 non-null float64 dtypes: datetime64[ns](2), float64(8), int32(1), int64(2), object(5) memory usage: 24.2+ MB
# Add new columns for the trip start date in yyyy-mm-dd format, the start
# hour of the day (zero-padded string) and the name of the day of the week.
# assign() returns a new DataFrame with the extra columns, so nothing is
# written into a filtered slice and no SettingWithCopyWarning is raised.
data = data.assign(
    start_date=data['start_time'].dt.strftime('%Y-%m-%d'),
    start_hour=data['start_time'].dt.strftime('%H'),
    start_day=data['start_time'].dt.strftime('%A'),
)
<ipython-input-153-5c7b169d1f0b>:3: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
data['start_date'] = data.start_time.dt.strftime('%Y-%m-%d')
<ipython-input-153-5c7b169d1f0b>:4: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
data['start_hour'] = data.start_time.dt.strftime('%H')
<ipython-input-153-5c7b169d1f0b>:5: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
data['start_day'] = data.start_time.dt.strftime('%A')
# Confirm that the start_date, start_hour and start_day columns were added.
data.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 171119 entries, 0 to 183411 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 duration_sec 171119 non-null int64 1 start_time 171119 non-null datetime64[ns] 2 end_time 171119 non-null datetime64[ns] 3 start_station_id 171119 non-null float64 4 start_station_name 171119 non-null object 5 start_station_latitude 171119 non-null float64 6 start_station_longitude 171119 non-null float64 7 end_station_id 171119 non-null float64 8 end_station_name 171119 non-null object 9 end_station_latitude 171119 non-null float64 10 end_station_longitude 171119 non-null float64 11 bike_id 171119 non-null int64 12 user_type 171119 non-null object 13 member_birth_year 171119 non-null float64 14 member_gender 171119 non-null object 15 bike_share_for_all_trip 171119 non-null object 16 member_age 171119 non-null int32 17 duration_min 171119 non-null float64 18 start_date 171119 non-null object 19 start_hour 171119 non-null object 20 start_day 171119 non-null object dtypes: datetime64[ns](2), float64(8), int32(1), int64(2), object(8) memory usage: 28.1+ MB
# Heatmap of the number of trips per (day of week, start hour) for riders
# older than 20 and younger than 45.
# The title describes what is actually plotted: the data is filtered by age
# only and is not split by user type.
plt.suptitle('Hourly Usage by Day of Week for Riders Aged 21 to 44')
customers = data.query('member_age < 45 & member_age > 20')
# Count the trips per day/hour pair and spread the hours across the columns.
ct_counts = customers.groupby(['start_day', 'start_hour']).size().unstack('start_hour')
# Put the rows in calendar order; grouping alone sorts the day names
# alphabetically (Friday, Monday, Saturday, ...).
# NOTE: assumes `start_day` holds English weekday names.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday',
                 'Saturday', 'Sunday']
ct_counts = ct_counts.reindex(weekday_order)
sb.heatmap(ct_counts, cmap='rocket_r');
plt.title('member age', loc='right');
plt.xlabel('Hour of Day');
plt.ylabel('Day of Week');
# Scatter the trip duration (minutes) against the rider age, drawing each
# gender with its own marker shape.
gender_mark = [['Male', 'v'], ['Female', 's']]
for gender, marker in gender_mark:
    # Rows belonging to the current gender only.
    df_gender = data.loc[data['member_gender'].eq(gender)]
    plt.scatter(
        x=df_gender['member_age'],
        y=df_gender['duration_min'],
        marker=marker,
        alpha=1,
    )
plt.legend(['Male', 'Female'])
# Axis limits are given as [xmin, xmax, ymin, ymax].
plt.axis([10, 80, 0, 60])
plt.title('the Trip Durationw with Gender and Age')
plt.xlabel('Age')
plt.ylabel('Duration in minute')
plt.show()
The most crowded stations at rush hours differ from the most crowded stations over the whole day, and I conclude that people avoid crowds at peak times unless the trip is for work or study. This conclusion is very useful, because the company will have to provide for a larger number of trips at specific places during rush hours, and adjust this for the other times of the day.
At the end of your report, make sure that you export the notebook as an HTML file from the
File > Download as... > HTML menu. Make sure you keep track of where the exported file goes, so you can put it in the same folder as this notebook for project submission. Also, make sure you remove all of the quote-formatted guide notes like this one before you finish your report!